library(dplyr)
library(leaflet)
library(geosphere)
library(data.table)
library(ggplot2)
library(lubridate)
library(caret)
library(readxl)
library(tidyverse)
library(ggthemes)
library(ggrepel)
library(reshape)
library(reshape2)
library(devtools)
Starting with web scrapping we collected datasets from Boston Hubway and weather sites, focusing on the year 2017. We merged these datasets into a single csv file for our subsequent analysis. We performed exploratory analysis of bluebike riders in Boston, with the hopes of trying to have a general idea of who a typical Blue bike subscriber was, and how their riding pattern changed with changing weather. We then drilled down on bluebike subscribers who ride overtime, meaning more than 45 mins. Since these people pay an extra $2.5 per 30mins, and thus make bluebike company extra revenue. This informed our subsequent analysis, using multiple logistic regression techniques and Decision tree analysis. We tried to predict the typical oversubscriber and what day of the week such a rider would be out and about.
# Importing the all monthly data in 2017
dat_201701<-read.csv("201701-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201702<-read.csv("201702-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201703<-read.csv("201703-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201704<-read.csv("201704-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201705<-read.csv("201705-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201706<-read.csv("201706-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201707<-read.csv("201707-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201708<-read.csv("201708-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201709<-read.csv("201709-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201710<-read.csv("201710-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201711<-read.csv("201711-hubway-tripdata.csv",stringsAsFactors = FALSE)
dat_201712<-read.csv("201712-hubway-tripdata.csv",stringsAsFactors = FALSE)
# Combine them
dat_2017 <- rbind(dat_201701, dat_201702, dat_201703, dat_201704, dat_201705,
dat_201706, dat_201707, dat_201708, dat_201709, dat_201710,
dat_201711, dat_201712)
## age, age_cat, duration_min, year, month, month_abb, day, hour, wday, weekend
bbike <- dat_2017 %>%
mutate(birth.year = as.numeric(birth.year)) %>%
mutate(age = 2017 - birth.year) %>%
mutate(age_cat = case_when(
.$age >= 10 & .$age < 20 ~ 1,
.$age >= 20 & .$age < 30 ~ 2,
.$age >= 30 & .$age < 40 ~ 3,
.$age >= 40 & .$age < 50 ~ 4,
.$age >= 50 & .$age < 60 ~ 5,
.$age >= 60 & .$age < 70 ~ 6,
.$age >= 70 & .$age < 80 ~ 7,
.$age >= 80 ~ 8)) %>%
mutate(duration_min = tripduration / 60) %>%
mutate(year = year(starttime),
month = month(starttime),
month_abb = month(starttime, label = TRUE, abbr = TRUE),
day = day(starttime),
hour = hour(starttime),
wday = wday(starttime, label = TRUE, abbr = TRUE))
## Trip distance (km)
setDT(bbike)[ , dist_km := distGeo(matrix(c(start.station.longitude, start.station.latitude), ncol = 2),matrix(c(end.station.longitude, end.station.latitude), ncol = 2))/1000]
bbike <- as.data.frame(bbike)
## overtime (if duration_min > 45, 1, 0)
bbike <- bbike %>%
mutate(overtime = ifelse(duration_min > 45, 1, 0))
## user_start, user_end: number of users at the start/end station
bbike <- bbike %>%
group_by(start.station.id) %>%
mutate(user_start = n())
bbike <- bbike %>%
group_by(end.station.id) %>%
mutate(user_end = n())
## temp_max, temp_min, rain, snownice(snow or ice)
weather <- read_excel("boston_weather.xls")
bbike <- bbike %>%
group_by(year, month, day) %>%
left_join(., weather, by = c("year", "month", "day"))
The number of bluelike users has increased over several years. Among subscribers, if they pay $99 per year, they can use it unlimitedly. However, there is a time limit, for 45 minutes per once. We are going to figure out when they do not return their bike within 45 minutes and predict the pattern.
Membership and Ridership More than 8 million trips have been taken by Bluebikes riders since the 2011 launch (as of 12/2018) An estimated 87,000 unique riders took trips in 2016
bbike_summary <- read_excel("bluebike_summary.xlsx")
bbike_summary <- as.data.frame(bbike_summary)
plot <- melt(bbike_summary, id.vars = "year") %>%
filter(variable == "subscriber" | variable == "customer")
plot %>%
ggplot(aes(year, value)) +
geom_bar(aes(fill = variable), stat = "identity", position = "stack") +
scale_fill_manual(values = c("#FF8F1C", "#0050B5")) +
scale_x_continuous(breaks = c(2011, 2012, 2013, 2014, 2015, 2016, 2017)) +
xlab("Year") +
ylab("Riders") +
ggtitle("Number of Riders since 2011") +
theme_bw()
bbike_summary %>%
ggplot(aes(year, total_trips)) +
geom_point(color = "#FF8F1C", size = 2) +
geom_line(color = "#1D428A", alpha = 0.7) +
xlab("Year") +
ylab("Numbers") +
ggtitle("Total Number of Trips since 2011") +
theme_bw()
bbike %>%
mutate(gender = as.factor(gender)) %>%
filter(gender != "0") %>% #1 is male #2 is female #0 is unknown
ggplot(aes(age_cat)) +
geom_bar(aes(fill = gender)) +
scale_fill_brewer( palette = "Oranges")+
xlab("Age category (10s, 20s, 30s etc)") +
ylab("Number of rides") +
ggtitle("Total Number of Trips by gender by age") +
theme_bw()
# Create data frame
dat_station<- read.csv("Hubway_Stations_as_of_July_2017.csv")
dat_2017_station <- bbike %>%
filter(!birth.year=="\\N")%>%
filter(birth.year>1900 & birth.year<=2017)%>%
group_by(start.station.id) %>% summarize(number = n(), start.station.latitude=first(start.station.latitude), start.station.longitude=first(start.station.longitude),start.station.name=first(start.station.name))%>% filter(start.station.latitude>0)
summary(dat_2017_station)
## start.station.id number start.station.latitude
## Min. : 1.00 Min. : 4 Min. :42.30
## 1st Qu.: 54.75 1st Qu.: 1794 1st Qu.:42.34
## Median :108.50 Median : 4750 Median :42.36
## Mean :111.91 Mean : 5625 Mean :42.36
## 3rd Qu.:173.25 3rd Qu.: 7614 3rd Qu.:42.37
## Max. :232.00 Max. :35702 Max. :42.41
## start.station.longitude start.station.name
## Min. :-71.17 Length:196
## 1st Qu.:-71.11 Class :character
## Median :-71.08 Mode :character
## Mean :-71.09
## 3rd Qu.:-71.06
## Max. :-71.01
# Distinguish stations by color based on the number of users
getColor <- function(df){
sapply(df$number, function(number) {
if(number %in% 1:4000) {
"green"
} else if(number %in% 4001:10000) {
"orange"
} else if(number >10000) {
"red"
} else {
"blue"
} })
}
icons <- awesomeIcons(
icon = 'ios-close',
iconColor = 'black',
library = 'ion',
markerColor = getColor(dat_2017_station)
)
leaflet(dat_2017_station) %>% addTiles() %>%
addAwesomeMarkers(~start.station.longitude
, ~start.station.latitude, icon=icons, label=~as.character(number), popup = ~start.station.name)
bbike %>% ggplot(aes(hour))+
geom_bar()+
scale_x_continuous(breaks=(c(0,4,8,12,16,20)))+
xlab("Hour")+
ylab("Number of users") +
facet_wrap(~ wday)
bbike %>% ggplot(aes(hour))+
geom_bar(aes(fill = factor(gender)))+
scale_x_continuous(breaks=(c(0,4,8,12,16,20)))+
xlab("Hour")+
ylab("Number of users") +
facet_wrap(~ month) +
scale_fill_discrete("Gender", labels = c("Unknown", "Male", "Female"))
bbike_member <- bbike %>%
filter(usertype == "Subscriber")
summary(bbike_member$duration_min)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.02 6.10 9.78 13.31 15.48 61276.80
As you can see, the data seems to have wrong information. The very long tripduration might be attributable to lost or other errors. Therefore, we limit the range from 0 to 75 minutes for the duration in this study.
bbike_member <- bbike_member %>%
filter(duration_min < 50)
bbike_member %>%
mutate(group = ifelse(rain == 0, "no rain", "rain")) %>%
ggplot(aes(duration_min, y = ..count.., fill = group)) +
geom_density(alpha = 0.2) +
xlab("Trip Duration (min)") +
ylab("Riders")
bbike_member %>%
mutate(gender = as.factor(gender)) %>%
ggplot(aes(duration_min, y = ..count.., fill = gender)) +
geom_density(alpha = 0.2) +
xlab("Trip Duration (min)") +
ylab("Riders")
# Distance
bbike %>%
filter(birth.year > 1900 & birth.year <= 2017)%>%
group_by(age_cat) %>%
summarize(avg = mean(dist_km), se = sd(dist_km) / sqrt(n())) %>%
ggplot(aes(age_cat, avg))+
geom_errorbar(aes(ymin = avg - 2*se, ymax = avg+2 * se), width = 0.2)+
geom_point(color = "#FF8F1C")+
geom_line(color = "#1D428A")+
scale_x_continuous(breaks=(c(1,2,3,4,5,6,7,8)), labels=c("10-20","20-30","30-40","40-50","50-60","60-70","70-80","80-"))+
xlab(expression(paste(Age, " (years)")))+
ylab(expression(paste(Distance," (km)"))) +
theme_bw()
There is no consistent trend between age and trip distance.
bbikemember <- bbike %>%
ungroup() %>%
filter(usertype == "Subscriber" & duration_min < 75) %>%
mutate(gender = as.factor(gender), day = as.factor(day),
hour = as.factor(hour)) %>%
mutate(rain_cat = ifelse(rain == 0, 0, 1)) %>%
mutate(snownice_cat = ifelse(snownice == 0, 0, 1)) %>%
mutate(overtime = ifelse(duration_min < 15, 0, 1)) %>%
mutate(overtime = as.factor(overtime)) %>%
mutate(satsun = ifelse(wday %in% c("Sat", "Sun"), 1, 0)) %>%
mutate(satsun = as.factor(satsun))
set.seed(1)
library(caret)
Train <- createDataPartition(bbikemember$overtime, p=0.5, list=FALSE)
training <- bbikemember[ Train, ]
testing <- bbikemember[ -Train, ]
bbikemember$overtime = as.factor(as.numeric(as.character(bbikemember $overtime)))
#overall accuracy
p = 0.358 #290235/810623
y_hat <- sample(c("0","1"), length(testing), replace = TRUE, prob=c(p, 1-p)) %>%
factor(levels = levels(bbikemember$overtime))
mean(y_hat == testing$overtime)
## Warning in `==.default`(y_hat, testing$overtime): longer object length is
## not a multiple of shorter object length
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple of
## shorter object length
## [1] 0.4254289
#logistic regression
glm.fit <- training %>%
glm(overtime ~ gender + age + month + satsun + rain_cat +
snownice_cat + user_start + user_end, data=., family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = overtime ~ gender + age + month + satsun + rain_cat +
## snownice_cat + user_start + user_end, family = "binomial",
## data = .)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5008 -0.8277 -0.7032 1.3467 2.6214
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.371e-02 4.494e-02 1.418 0.156
## gender1 -7.974e-01 4.228e-02 -18.860 < 2e-16 ***
## gender2 -4.754e-01 4.254e-02 -11.175 < 2e-16 ***
## age 6.794e-03 2.698e-04 25.182 < 2e-16 ***
## month 6.075e-03 1.242e-03 4.891 1.01e-06 ***
## satsun1 2.427e-01 7.845e-03 30.933 < 2e-16 ***
## rain_cat -9.405e-02 6.836e-03 -13.758 < 2e-16 ***
## snownice_cat -6.017e-01 4.122e-02 -14.597 < 2e-16 ***
## user_start -2.917e-05 4.176e-07 -69.851 < 2e-16 ***
## user_end -2.679e-05 3.878e-07 -69.086 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 633720 on 549482 degrees of freedom
## Residual deviance: 615384 on 549473 degrees of freedom
## (947 observations deleted due to missingness)
## AIC: 615404
##
## Number of Fisher Scoring iterations: 4
#prediction
p_hat <- predict(glm.fit, newdata=testing, type="response")
y_hat <- ifelse(p_hat > 0.5, 1, 0) %>% factor()
#confusion matrix
table(predicted = y_hat, actual = testing$overtime)
## actual
## predicted 0 1
## 0 404103 144268
## 1 601 518
confusionMatrix(data = factor(y_hat), reference = factor(testing$overtime))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 404103 144268
## 1 601 518
##
## Accuracy : 0.7364
## 95% CI : (0.7352, 0.7375)
## No Information Rate : 0.7365
## P-Value [Acc > NIR] : 0.601
##
## Kappa : 0.0031
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.998515
## Specificity : 0.003578
## Pos Pred Value : 0.736915
## Neg Pred Value : 0.462913
## Prevalence : 0.736508
## Detection Rate : 0.735415
## Detection Prevalence : 0.997964
## Balanced Accuracy : 0.501046
##
## 'Positive' Class : 0
##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
model <- training %>%
glm(overtime ~ gender + age + month + satsun + rain_cat +
snownice_cat + user_start + user_end, data=., family = "binomial")
#First example
predpr <- predict(model, newdata=testing, type="response")
roccurve<- roc(testing$overtime~predpr)
plot(roccurve)
roccurve
##
## Call:
## roc.formula(formula = testing$overtime ~ predpr)
##
## Data: predpr in 404704 controls (testing$overtime 0) < 144786 cases (testing$overtime 1).
## Area under the curve: 0.6135
# Second example
predict <- predict(model, newdata=testing, type = "response")
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
ROCRpred <- prediction(predict, testing$overtime)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf, colorize = F, text.adj = c(-0.2,1.7))
library(purrr)
library(caret)
library(ggplot2)
probs <- seq(0, 1, length.out = 10)
guessing <- map_df(probs, function(p){
y_hat <-
sample(c("0", "1"), length(testing), replace = TRUE, prob=c(p, 1-p)) %>%
factor(levels = levels(bbikemember$overtime))
list(method = "Guessing",
FPR = 1 - specificity(y_hat, bbikemember$overtime),
TPR = sensitivity(y_hat, bbikemember$overtime))
})
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% negative & reference %in% negative: longer object
## length is not a multiple of shorter object length
## Warning in complete.cases(data) & complete.cases(reference): longer object
## length is not a multiple of shorter object length
## Warning in data %in% positive & reference %in% positive: longer object
## length is not a multiple of shorter object length
guessing %>% qplot(FPR, TPR, data =., xlab = "1 - Specificity", ylab = "Sensitivity")
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
bbikemember <- bbikemember %>%
filter(gender != 0)
output_tree <- ctree(overtime ~ gender + factor(satsun),
data = bbikemember)
plot(output_tree)